In [1]:
import plotly.express as px
primary_blue = "#496595"
primary_blue2 = "#85a1c1"
primary_blue3 = "#3f4d63"
primary_grey = "#c6ccd8"
primary_black = "#202022"
primary_bgcolor = "#f4f0ea"
primary_green = px.colors.qualitative.Plotly[2]
In [2]:
import pandas as pd
# Load the data
df = pd.read_csv('C:/Users/priya/Downloads/spam.csv', encoding='latin-1')
df = df.dropna(how="any", axis=1)
df.columns = ['target', 'message']
df.head()
Out[2]:
| target | message | |
|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... |
| 1 | ham | Ok lar... Joking wif u oni... |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
| 3 | ham | U dun say so early hor... U c already then say... |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... |
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5572 entries, 0 to 5571 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 target 5572 non-null object 1 message 5572 non-null object dtypes: object(2) memory usage: 87.2+ KB
In [4]:
df.isnull().sum()
Out[4]:
target 0 message 0 dtype: int64
In [5]:
df['message_len'] = df['message'].apply(lambda x: len(x.split(' ')))
df.head()
Out[5]:
| target | message | message_len | |
|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 20 |
| 1 | ham | Ok lar... Joking wif u oni... | 6 |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 28 |
| 3 | ham | U dun say so early hor... U c already then say... | 11 |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 13 |
In [6]:
max(df['message_len'])
Out[6]:
171
EDA¶
In [7]:
balance_counts = df.groupby('target')['target'].agg('count').values
balance_counts
Out[7]:
array([4825, 747], dtype=int64)
In [8]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Bar(
x=['ham'],
y=[balance_counts[0]],
name='ham',
text=[balance_counts[0]],
textposition='auto',
marker_color=primary_blue
))
fig.add_trace(go.Bar(
x=['spam'],
y=[balance_counts[1]],
name='spam',
text=[balance_counts[1]],
textposition='auto',
marker_color=primary_grey
))
fig.update_layout(
title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()
In [9]:
ham_df = df[df['target'] == 'ham']['message_len'].value_counts().sort_index()
spam_df = df[df['target'] == 'spam']['message_len'].value_counts().sort_index()
fig = go.Figure()
fig.add_trace(go.Scatter(
x=ham_df.index,
y=ham_df.values,
name='ham',
fill='tozeroy',
marker_color=primary_blue,
))
fig.add_trace(go.Scatter(
x=spam_df.index,
y=spam_df.values,
name='spam',
fill='tozeroy',
marker_color=primary_grey,
))
fig.update_layout(
title='<span style="font-size:32px; font-family:Times New Roman">Data Roles in Different Fields</span>'
)
fig.update_xaxes(range=[0, 70])
fig.show()
Data pre processing¶
In [10]:
import re
import pandas as pd
import string
# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
'''Make text lowercase, remove text in square brackets,remove links,remove punctuation
and remove words containing numbers.'''
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)
return text
In [11]:
df['message_clean'] = df['message'].apply(clean_text)
df.head()
Out[11]:
| target | message | message_len | message_clean | |
|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 20 | go until jurong point crazy available only in ... |
| 1 | ham | Ok lar... Joking wif u oni... | 6 | ok lar joking wif u oni |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 28 | free entry in a wkly comp to win fa cup final... |
| 3 | ham | U dun say so early hor... U c already then say... | 11 | u dun say so early hor u c already then say |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 13 | nah i dont think he goes to usf he lives aroun... |
In [12]:
pip install nltk
Requirement already satisfied: nltk in c:\users\priya\anaconda3\lib\site-packages (3.8.1) Requirement already satisfied: click in c:\users\priya\anaconda3\lib\site-packages (from nltk) (8.1.7) Requirement already satisfied: joblib in c:\users\priya\anaconda3\lib\site-packages (from nltk) (1.2.0) Requirement already satisfied: regex>=2021.8.3 in c:\users\priya\anaconda3\lib\site-packages (from nltk) (2023.10.3) Requirement already satisfied: tqdm in c:\users\priya\anaconda3\lib\site-packages (from nltk) (4.65.0) Requirement already satisfied: colorama in c:\users\priya\anaconda3\lib\site-packages (from click->nltk) (0.4.6) Note: you may need to restart the kernel to use updated packages.
In [13]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords') # This will download the stopwords dataset
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\priya\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
Out[13]:
True
In [14]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords
def remove_stopwords(text):
text = ' '.join(word for word in text.split(' ') if word not in stop_words)
return text
df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()
Out[14]:
| target | message | message_len | message_clean | |
|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 20 | go jurong point crazy available bugis n great ... |
| 1 | ham | Ok lar... Joking wif u oni... | 6 | ok lar joking wif oni |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 28 | free entry wkly comp win fa cup final tkts m... |
| 3 | ham | U dun say so early hor... U c already then say... | 11 | dun say early hor already say |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 13 | nah dont think goes usf lives around though |
Stemming¶
In [15]:
stemmer = nltk.SnowballStemmer("english")
def stemm_text(text):
text = ' '.join(stemmer.stem(word) for word in text.split(' '))
return text
In [16]:
df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()
Out[16]:
| target | message | message_len | message_clean | |
|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 20 | go jurong point crazi avail bugi n great world... |
| 1 | ham | Ok lar... Joking wif u oni... | 6 | ok lar joke wif oni |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 28 | free entri wkli comp win fa cup final tkts m... |
| 3 | ham | U dun say so early hor... U c already then say... | 11 | dun say earli hor alreadi say |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 13 | nah dont think goe usf live around though |
In [17]:
def preprocess_data(text):
# Clean puntuation, urls, and so on
text = clean_text(text)
# Remove stopwords
text = ' '.join(word for word in text.split(' ') if word not in stop_words)
# Stemm all the words in the sentence
text = ' '.join(stemmer.stem(word) for word in text.split(' '))
return text
In [18]:
df['message_clean'] = df['message_clean'].apply(preprocess_data)
df.head()
Out[18]:
| target | message | message_len | message_clean | |
|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 20 | go jurong point crazi avail bugi n great world... |
| 1 | ham | Ok lar... Joking wif u oni... | 6 | ok lar joke wif oni |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 28 | free entri wkli comp win fa cup final tkts m... |
| 3 | ham | U dun say so early hor... U c already then say... | 11 | dun say ear hor alreadi say |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 13 | nah dont think goe usf live around though |
Target encoding¶
In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['target'])
df['target_encoded'] = le.transform(df['target'])
df.head()
Out[19]:
| target | message | message_len | message_clean | target_encoded | |
|---|---|---|---|---|---|
| 0 | ham | Go until jurong point, crazy.. Available only ... | 20 | go jurong point crazi avail bugi n great world... | 0 |
| 1 | ham | Ok lar... Joking wif u oni... | 6 | ok lar joke wif oni | 0 |
| 2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... | 28 | free entri wkli comp win fa cup final tkts m... | 1 |
| 3 | ham | U dun say so early hor... U c already then say... | 11 | dun say ear hor alreadi say | 0 |
| 4 | ham | Nah I don't think he goes to usf, he lives aro... | 13 | nah dont think goe usf live around though | 0 |
token visualization¶
In [20]:
import numpy as np
In [21]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Path to the locally saved image
local_image_path = 'C:/Users/priya/Downloads/icons8-twitter-64.png' # Update this path to where you saved the image
# Load the image into a numpy array
img = Image.open(local_image_path)
twitter_mask = np.array(img)
# Assuming 'df' and its column 'message_clean' are defined appropriately
wc = WordCloud(
background_color='white',
max_words=200,
mask=twitter_mask
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 'ham', 'message_clean']))
# Display the WordCloud
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
In [22]:
wc = WordCloud(
background_color='white',
max_words=200,
mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 'spam', 'message_clean']))
plt.figure(figsize=(18,10))
plt.title('Top words for SPAM messages',
fontdict={'size': 22, 'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
vectorization¶
In [23]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
x = df['message_clean']
y = df['target_encoded']
print(len(x), len(y))
5572 5572
In [24]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
4179 4179 1393 1393
In [25]:
from sklearn.feature_extraction.text import CountVectorizer
# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(x_train)
Out[25]:
CountVectorizer()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CountVectorizer()
In [26]:
# Use the trained to create a document-term matrix from train and test sets
x_train_dtm = vect.transform(x_train)
x_test_dtm = vect.transform(x_test)
5.1 tunning count vectorizer
In [27]:
vect_tunned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)
5.2 TF - IDF
In [28]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(x_train_dtm)
x_train_tfidf = tfidf_transformer.transform(x_train_dtm)
x_train_tfidf
Out[28]:
<4179x5684 sparse matrix of type '<class 'numpy.float64'>' with 32201 stored elements in Compressed Sparse Row format>
Glove
In [29]:
texts = df['message_clean']
target = df['target_encoded']
In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Calculate the length of our vocabulary
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
Out[30]:
6726
In [31]:
import tensorflow as tf
sequences = [[1], [2, 3], [4, 5, 6]]
tf.keras.preprocessing.sequence.pad_sequences(
sequences, maxlen=None, dtype='int32', padding='pre',
truncating='pre', value=0.0
)
Out[31]:
array([[0, 0, 1],
[0, 2, 3],
[4, 5, 6]])
In [32]:
tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
Out[32]:
array([[1, 0, 0],
[2, 3, 0],
[4, 5, 6]])
In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Initialize the Tokenizer and fit it on the texts
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)
def embed(corpus):
return word_tokenizer.texts_to_sequences(corpus)
# Find the longest sentence by token length
longest_train = max(texts, key=lambda sentence: len(embed([sentence])[0]))
length_long_sentence = len(embed([longest_train])[0])
# Pad the sequences
train_padded_sentences = pad_sequences(
embed(texts),
length_long_sentence,
padding='post'
)
print("Padded Sentences:\n", train_padded_sentences)
Padded Sentences: [[ 2 3179 274 ... 0 0 0] [ 8 236 527 ... 0 0 0] [ 9 356 588 ... 0 0 0] ... [6724 1002 6725 ... 0 0 0] [ 138 1251 1603 ... 0 0 0] [1986 378 170 ... 0 0 0]]
Glove
In [34]:
embeddings_dictionary = dict()
embedding_dim = 100
# Load GloVe 100D embeddings
glove_path = 'C:/Users/priya/Downloads/glove.6B.100d.txt/glove.6B.100d.txt'
embeddings_index = {}
with open(glove_path, 'r', encoding='utf-8') as fp:
for line in fp.readlines():
records = line.split()
word = records[0]
vector = np.array(records[1:], dtype='float32')
embeddings_index[word] = vector
print("Loaded {} word vectors.".format(len(embeddings_index)))
Loaded 400000 word vectors.
In [35]:
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in word_tokenizer.word_index.items():
embedding_vector = embeddings_dictionary.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
embedding_matrix
Out[35]:
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
In [36]:
import plotly.figure_factory as ff
x_axes = ['Ham', 'Spam']
y_axes = ['Spam', 'Ham']
def conf_matrix(z, x=x_axes, y=y_axes):
z = np.flip(z, 0)
# change each element of z to type string for annotations
z_text = [[str(y) for y in x] for x in z]
# set up figure
fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')
# add title
fig.update_layout(title_text='<b>Confusion matrix</b>',
xaxis = dict(title='Predicted value'),
yaxis = dict(title='Real value')
)
# add colorbar
fig['data'][0]['showscale'] = True
return fig
In [37]:
# Create a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
# Train the model
nb.fit(x_train_dtm, y_train)
Out[37]:
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
Navie Bayes DTM
In [38]:
# Make class anf probability predictions
y_pred_class = nb.predict(x_test_dtm)
y_pred_prob = nb.predict_proba(x_test_dtm)[:, 1]
In [39]:
# calculate accuracy of class predictions
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
0.9784637473079684
In [40]:
# Calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)
Out[40]:
0.974296765425861
Navies Bayes
In [41]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
pipe = Pipeline([('bow', CountVectorizer()),
('tfid', TfidfTransformer()),
('model', MultinomialNB())])
In [42]:
# Fit the pipeline with the data
pipe.fit(x_train, y_train)
y_pred_class = pipe.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_class))
conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
0.9597989949748744
XG Boost
In [43]:
import xgboost as xgb
pipe = Pipeline([
('bow', CountVectorizer()),
('tfid', TfidfTransformer()),
('model', xgb.XGBClassifier(
learning_rate=0.1,
max_depth=7,
n_estimators=80,
use_label_encoder=False,
eval_metric='auc',
# colsample_bytree=0.8,
# subsample=0.7,
# min_child_weight=5,
))
])
In [44]:
# Fit the pipeline with the data
pipe.fit(x_train, y_train)
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)
print('Train: {}'.format(metrics.accuracy_score(y_train, y_pred_train)))
print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))
conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
Train: 0.9830102895429529 Test: 0.9641062455132807
Lstm¶
In [45]:
print("Length of train_padded_sentences:", len(train_padded_sentences))
print("Length of target:", len(target))
Length of train_padded_sentences: 5572 Length of target: 5572
In [46]:
print("Type of embedding_matrix:", type(embedding_matrix))
print("Shape of embedding_matrix:", embedding_matrix.shape)
print("Type of length_long_sentence:", type(length_long_sentence))
print("Value of length_long_sentence:", length_long_sentence)
Type of embedding_matrix: <class 'numpy.ndarray'> Shape of embedding_matrix: (6726, 100) Type of length_long_sentence: <class 'int'> Value of length_long_sentence: 80
In [47]:
from tensorflow.keras.layers import Embedding
# Minimal Embedding example
embedding_layer = Embedding(input_dim=6726, output_dim=100)
In [48]:
import tensorflow as tf
print(tf.__version__)
2.16.1
In [49]:
print(type(embedding_matrix))
print(embedding_matrix.shape)
<class 'numpy.ndarray'> (6726, 100)
In [50]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
try:
model = Sequential([
Embedding(input_dim=6726, output_dim=100) # Leave out input_length
])
print("Embedding layer initialized successfully without input_length.")
except Exception as e:
print("Failed to initialize Embedding layer without input_length:", str(e))
Embedding layer initialized successfully without input_length.
In [51]:
from tensorflow.keras.layers import Input, Embedding
from tensorflow.keras.models import Model
input_layer = Input(shape=(8,)) # Directly specify input shape here
embedding_layer = Embedding(input_dim=6726, output_dim=100)(input_layer) # Without input_length
model = Model(inputs=input_layer, outputs=embedding_layer)
try:
model.summary()
print("Model with Embedding layer initialized successfully.")
except Exception as e:
print("Error in model setup:", str(e))
Model: "functional_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ input_layer (InputLayer) │ (None, 8) │ 0 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ embedding_2 (Embedding) │ (None, 8, 100) │ 672,600 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 672,600 (2.57 MB)
Trainable params: 672,600 (2.57 MB)
Non-trainable params: 0 (0.00 B)
Model with Embedding layer initialized successfully.
In [52]:
try:
# Test embedding layer creation in isolation
embedding_layer = Embedding(input_dim=6726, output_dim=100)
print("Embedding layer standalone initialized successfully.")
except Exception as e:
print("Embedding layer standalone initialization failed:", str(e))
Embedding layer standalone initialized successfully.
NLP¶
In [53]:
df = pd.read_csv('C:/Users/priya/Downloads/train.csv', encoding="latin-1")
test_df = pd.read_csv('C:/Users/priya/Downloads/test.csv', encoding="latin-1")
df = df.dropna(how="any", axis=1)
df['text_len'] = df['text'].apply(lambda x: len(x.split(' ')))
df.head()
Out[53]:
| id | text | target | text_len | |
|---|---|---|---|---|
| 0 | 1 | Our Deeds are the Reason of this #earthquake M... | 1 | 13 |
| 1 | 4 | Forest fire near La Ronge Sask. Canada | 1 | 7 |
| 2 | 5 | All residents asked to 'shelter in place' are ... | 1 | 22 |
| 3 | 6 | 13,000 people receive #wildfires evacuation or... | 1 | 9 |
| 4 | 7 | Just got sent this photo from Ruby #Alaska as ... | 1 | 17 |
eda¶
In [54]:
balance_counts = df.groupby('target')['target'].agg('count').values
balance_counts
Out[54]:
array([4342, 3271], dtype=int64)
In [55]:
fig = go.Figure()
fig.add_trace(go.Bar(
x=['Fake'],
y=[balance_counts[0]],
name='Fake',
text=[balance_counts[0]],
textposition='auto',
marker_color=primary_blue
))
fig.add_trace(go.Bar(
x=['Real disaster'],
y=[balance_counts[1]],
name='Real disaster',
text=[balance_counts[1]],
textposition='auto',
marker_color=primary_grey
))
fig.update_layout(
title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()
In [56]:
disaster_df = df[df['target'] == 1]['text_len'].value_counts().sort_index()
fake_df = df[df['target'] == 0]['text_len'].value_counts().sort_index()
fig = go.Figure()
fig.add_trace(go.Scatter(
x=disaster_df.index,
y=disaster_df.values,
name='Real disaster',
fill='tozeroy',
marker_color=primary_blue,
))
fig.add_trace(go.Scatter(
x=fake_df.index,
y=fake_df.values,
name='Fake',
fill='tozeroy',
marker_color=primary_grey,
))
fig.update_layout(
title='<span style="font-size:32px; font-family:Times New Roman">Data Roles in Different Fields</span>'
)
fig.show()
data preprocessing¶
In [57]:
def remove_url(text):
url = re.compile(r'https?://\S+|www\.\S+')
return url.sub(r'', text)
def remove_emoji(text):
emoji_pattern = re.compile(
'['
u'\U0001F600-\U0001F64F' # emoticons
u'\U0001F300-\U0001F5FF' # symbols & pictographs
u'\U0001F680-\U0001F6FF' # transport & map symbols
u'\U0001F1E0-\U0001F1FF' # flags (iOS)
u'\U00002702-\U000027B0'
u'\U000024C2-\U0001F251'
']+',
flags=re.UNICODE)
return emoji_pattern.sub(r'', text)
def remove_html(text):
html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
return re.sub(html, '', text)
# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
'''Make text lowercase, remove text in square brackets,remove links,remove punctuation
and remove words containing numbers.'''
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub(
'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
'',
text
)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)
text = remove_url(text)
text = remove_emoji(text)
text = remove_html(text)
return text
In [58]:
# Test emoji removal
remove_emoji("Omg another Earthquake 😔😔")
Out[58]:
'Omg another Earthquake '
In [59]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords
stemmer = nltk.SnowballStemmer("english")
def preprocess_data(text):
# Clean puntuation, urls, and so on
text = clean_text(text)
# Remove stopwords and Stemm all the words in the sentence
text = ' '.join(stemmer.stem(word) for word in text.split(' ') if word not in stop_words)
return text
In [60]:
test_df['text_clean'] = test_df['text'].apply(preprocess_data)
df['text_clean'] = df['text'].apply(preprocess_data)
df.head()
Out[60]:
| id | text | target | text_len | text_clean | |
|---|---|---|---|---|---|
| 0 | 1 | Our Deeds are the Reason of this #earthquake M... | 1 | 13 | deed reason earthquak may allah forgiv us |
| 1 | 4 | Forest fire near La Ronge Sask. Canada | 1 | 7 | forest fire near la rong sask canada |
| 2 | 5 | All residents asked to 'shelter in place' are ... | 1 | 22 | resid ask shelter place notifi offic evacu she... |
| 3 | 6 | 13,000 people receive #wildfires evacuation or... | 1 | 9 | peopl receiv wildfir evacu order california |
| 4 | 7 | Just got sent this photo from Ruby #Alaska as ... | 1 | 17 | got sent photo rubi alaska smoke wildfir pour ... |
wordcloud¶
In [61]:
def create_corpus_df(tweet, target):
corpus=[]
for x in tweet[tweet['target']==target]['text_clean'].str.split():
for i in x:
corpus.append(i)
return corpus
In [62]:
from collections import defaultdict
corpus_disaster_tweets = create_corpus_df(df, 1) # Assuming create_corpus_df and df are defined elsewhere
dic = defaultdict(int)
for word in corpus_disaster_tweets:
dic[word] += 1
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]
print(top)
[('fire', 266), ('bomb', 179), ('kill', 158), ('news', 132), ('via', 121), ('flood', 120), ('disast', 116), ('california', 115), ('crash', 110), ('suicid', 110)]
In [63]:
local_image_path = 'C:/Users/priya/Downloads/icons8-twitter-64.png' # Update this path to where you saved the image
# Load the image into a numpy array
img = Image.open(local_image_path)
twitter_mask = np.array(img)
wc = WordCloud(
background_color='white',
max_words=200,
mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 1, 'text_clean']))
plt.figure(figsize=(18,10))
plt.title('Top words for Real Disaster tweets',
fontdict={'size': 22, 'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
In [64]:
corpus_disaster_tweets = create_corpus_df(df, 0)
dic=defaultdict(int)
for word in corpus_disaster_tweets:
dic[word]+=1
top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]
top
Out[64]:
[('like', 306),
('get', 222),
('amp', 192),
('new', 168),
('go', 142),
('dont', 139),
('one', 134),
('bodi', 116),
('love', 115),
('bag', 108)]
In [65]:
wc = WordCloud(
background_color='white',
max_words=200,
mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 0, 'text_clean']))
plt.figure(figsize=(18,10))
plt.title('Top words for Fake messages',
fontdict={'size': 22, 'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
modeling¶
In [66]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
x = df['text_clean']
y = df['target']
# Split into train and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
5709 5709 1904 1904
In [67]:
pipe = Pipeline([
('bow', CountVectorizer()),
('tfid', TfidfTransformer()),
('model', xgb.XGBClassifier(
use_label_encoder=False,
eval_metric='auc',
))
])
from sklearn import metrics
# Fit the pipeline with the data
pipe.fit(x_train, y_train)
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)
print('Train: {}'.format(metrics.accuracy_score(y_train, y_pred_train)))
print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))
conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
Train: 0.861096514275705 Test: 0.773109243697479
Glove-LSTM¶
In [68]:
train_tweets = df['text_clean'].values
test_tweets = test_df['text_clean'].values
train_target = df['target'].values
In [69]:
# Calculate the length of our vocabulary
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_tweets)
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
Out[69]:
13704
In [70]:
def show_metrics(pred_tag, y_test):
print("F1-score: ", f1_score(pred_tag, y_test))
print("Precision: ", precision_score(pred_tag, y_test))
print("Recall: ", recall_score(pred_tag, y_test))
print("Acuracy: ", accuracy_score(pred_tag, y_test))
print("-"*50)
print(classification_report(pred_tag, y_test))
def embed(corpus):
return word_tokenizer.texts_to_sequences(corpus)
In [71]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Assuming train_tweets and test_tweets are your datasets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_tweets) # Fit the tokenizer on training data
In [72]:
train_sequences = tokenizer.texts_to_sequences(train_tweets)
test_sequences = tokenizer.texts_to_sequences(test_tweets)
In [73]:
longest_train = max(train_sequences, key=len) # Find the longest sequence
length_long_sentence = len(longest_train) # Get the length of the longest sequence
In [74]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_padded_sentences = pad_sequences(train_sequences, maxlen=length_long_sentence, padding='post')
test_padded_sentences = pad_sequences(test_sequences, maxlen=length_long_sentence, padding='post')
In [75]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_tweets)
# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_tweets)
test_sequences = tokenizer.texts_to_sequences(test_tweets)
# Find the longest sequence and its length
longest_train = max(train_sequences, key=len)
length_long_sentence = len(longest_train)
# Pad sequences
train_padded_sentences = pad_sequences(train_sequences, maxlen=length_long_sentence, padding='post')
test_padded_sentences = pad_sequences(test_sequences, maxlen=length_long_sentence, padding='post')
print(train_padded_sentences) # Print the padded training sentences
[[3635 467 201 ... 0 0 0] [ 136 2 106 ... 0 0 0] [1338 502 1807 ... 0 0 0] ... [ 448 1328 0 ... 0 0 0] [ 28 162 2637 ... 0 0 0] [ 171 31 413 ... 0 0 0]]
Glove¶
In [76]:
embedding_matrix = np.zeros((vocab_length, embedding_dim))
for word, index in word_tokenizer.word_index.items():
embedding_vector = embeddings_dictionary.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
embedding_matrix
Out[76]:
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
In [77]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
train_padded_sentences,
train_target,
test_size=0.25
)
In [81]:
from tensorflow.keras.layers import Embedding
try:
# Test with only required parameters
embedding_layer = Embedding(input_dim=6726, output_dim=100)
print("Basic Embedding layer initialized successfully.")
except Exception as e:
print("Failed at basic Embedding initialization:", str(e))
Basic Embedding layer initialized successfully.
In [82]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout, BatchNormalization
def glove_lstm():
try:
model = Sequential()
model.add(Embedding(input_dim=6726, output_dim=100, input_length=80, weights=[np.zeros((6726, 100))], trainable=False))
model.add(Bidirectional(LSTM(80, return_sequences=True, recurrent_dropout=0.2)))
model.add(GlobalMaxPool1D())
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(80, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(80, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
return model
except Exception as e:
print(f"An error occurred while creating the model: {str(e)}")
return None
In [91]:
import matplotlib.pyplot as plt
def plot_learning_curves(history, metrics):
plt.figure(figsize=(12, 6))
for subplot, metric_list in enumerate(metrics, 1):
plt.subplot(1, len(metrics), subplot)
for metric in metric_list:
plt.plot(history.history[metric], label=metric)
plt.title(' and '.join(metric_list).title())
plt.xlabel('Epochs')
plt.ylabel('Value')
plt.legend()
plt.show()
In [84]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
model = Sequential([
Dense(10, activation='relu', input_shape=(10,)),
Dense(1, activation='sigmoid')
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
# Simulating training data
import numpy as np
X_train = np.random.random((100, 10))
y_train = np.random.randint(2, size=(100, 1))
X_test = np.random.random((20, 10))
y_test = np.random.randint(2, size=(20, 1))
# Training the model
history = model.fit(
X_train,
y_train,
epochs=10,
batch_size=10,
validation_data=(X_test, y_test)
)
Epoch 1/10
C:\Users\priya\anaconda3\Lib\site-packages\keras\src\layers\core\dense.py:88: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
10/10 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step - accuracy: 0.5381 - loss: 0.6791 - val_accuracy: 0.3500 - val_loss: 0.8526 Epoch 2/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.6358 - loss: 0.6457 - val_accuracy: 0.3500 - val_loss: 0.8490 Epoch 3/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.5480 - loss: 0.7108 - val_accuracy: 0.4000 - val_loss: 0.8473 Epoch 4/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6754 - loss: 0.6538 - val_accuracy: 0.4000 - val_loss: 0.8467 Epoch 5/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.5631 - loss: 0.6921 - val_accuracy: 0.4000 - val_loss: 0.8471 Epoch 6/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.6259 - loss: 0.6683 - val_accuracy: 0.4000 - val_loss: 0.8442 Epoch 7/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6993 - loss: 0.6263 - val_accuracy: 0.4000 - val_loss: 0.8433 Epoch 8/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6444 - loss: 0.6558 - val_accuracy: 0.4500 - val_loss: 0.8410 Epoch 9/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6533 - loss: 0.6457 - val_accuracy: 0.4500 - val_loss: 0.8397 Epoch 10/10 10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.5741 - loss: 0.6572 - val_accuracy: 0.4500 - val_loss: 0.8397
In [88]:
from sklearn.metrics import f1_score, precision_score, recall_score
def show_metrics(pred_tag, y_test):
print("F1-score: ", f1_score(y_test, pred_tag, average='macro'))
print("Precision: ", precision_score(y_test, pred_tag, average='macro'))
print("Recall: ", recall_score(y_test, pred_tag, average='macro'))
show_metrics(pred_labels, y_test)
F1-score: 0.28571428571428575 Precision: 0.2 Recall: 0.5
C:\Users\priya\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
In [89]:
plot_learning_curves(history, [['loss', 'val_loss'], ['accuracy', 'val_accuracy']])
In [90]:
import numpy as np
# Predict probabilities for each class
preds = model.predict(X_test)
# Convert probabilities to class labels
pred_labels = np.argmax(preds, axis=1)
# Show metrics
show_metrics(pred_labels, y_test)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 31ms/step F1-score: 0.28571428571428575 Precision: 0.2 Recall: 0.5
C:\Users\priya\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
In [ ]:
In [ ]: